eda

Last modified 10:57 AM on May 11, 2016. This document, R session image, version history, knitr cache, figures, and other associated datasets are located in /inside/grotto/blin/trna-markers/luad/predict/.

library(GenomicRanges)
library(RColorBrewer)
library(pheatmap)
library(plyr)
load('/inside/home/blin/grotto/trna-markers/process-reads/luad-counts.RData')
load('/inside/grotto/blin/data/hg19-srnas.RData')

excludeExtraneousGenes = function(counts, threshold = 0.96) {
  cor = cor(t(counts))
  cor[lower.tri(cor)] = NA # remove duplicates (x-y and y-x)
  cor_genes = data.frame(cbind(which(!is.na(cor), arr.ind = TRUE), na.omit(as.vector(cor))))
  cor_genes$row = rownames(cor)[cor_genes$row]
  cor_genes$col = colnames(cor)[cor_genes$col]
  cor_genes = cor_genes[cor_genes$V3 != 1 & cor_genes$V3 > threshold, ]
  if (nrow(cor_genes) == 0) return(rownames(cor))
  extraneous_genes = c()
  ok_genes = c()
  for (i in 1:nrow(cor_genes)) {
    gene1 = cor_genes$row[i]
    gene2 = cor_genes$col[i]
    if (!(gene1 %in% extraneous_genes) & !(gene1 %in% ok_genes)) {
      if (!(gene2 %in% extraneous_genes) & !(gene2 %in% ok_genes)) {
        ok_genes = c(ok_genes, gene1)
        extraneous_genes = c(extraneous_genes, gene2)
      }
      else {
        extraneous_genes = c(extraneous_genes, gene1)
      }
    }
    else {
      extraneous_genes = c(extraneous_genes, gene2)
    }
  }
  ok_genes
}

filterVariance = function(counts, var = 0.5) {
  gene_var = sapply(data.frame(t(counts)), var)
  names(gene_var) = rownames(counts)
  counts[which(gene_var > quantile(gene_var, 0.5)), ]
}

Default metadata for all heatmaps

heatmap_metadata = luad_metadata[luad_metadata$barcode %in% colnames(luad_adjusted_counts), ]
# subtype
load('/inside/grotto/blin/trna-markers/luad/cluster-subtypes/clusters.RData')
clusters$subtype[!clusters$known] = "Unknown"
heatmap_metadata$subtype = as.factor(clusters$subtype[match(substr(heatmap_metadata$barcode, 1, 12), clusters$barcode)])
# stage
levels(heatmap_metadata$stage) = list(I = c("Stage I", "Stage IA", "Stage IB"), II = c("Stage II", "Stage IIA", "Stage IIB"), III = c("Stage III", "Stage IIIA", "Stage IIIB"), IV = "Stage IV", Unknown = '[Not Available]')
# n stage
levels(heatmap_metadata$n_stage) = list(N0 = "N0", N1 = "N1", N2 = "N2", N3 = "N3", Unknown = c("NX", "[Not Available]"))
# smoker
levels(heatmap_metadata$smoker) = list(Smoker = 1, Nonsmoker = 0, Unknown = NA)
# heatmap annotation
levels(heatmap_metadata$sample_type) = list(TP = "TP", NT = "NT")
annot_col = data.frame(stage = heatmap_metadata$stage, subtype = heatmap_metadata$subtype, n_stage = heatmap_metadata$n_stage, sample_type = heatmap_metadata$sample_type, smoker = heatmap_metadata$smoker)
rownames(annot_col) = colnames(luad_adjusted_counts)
annot_row = data.frame(sRNA = as.factor(srnas[match(rownames(luad_adjusted_counts), srnas$tx_name), ]$class))
rownames(annot_row) = rownames(luad_adjusted_counts)
annot_colors = list(stage = setNames(c(brewer.pal(4, "Set1"), "grey"), levels(heatmap_metadata$stage)),
                    subtype = setNames(c(brewer.pal(3, "Set2"), "grey"), levels(heatmap_metadata$subtype)),
                    n_stage = setNames(c(brewer.pal(4, "Set3"), "grey"), levels(heatmap_metadata$n_stage)),
                    sample_type = setNames(c("#EF8A62", "#67A9CF"), levels(heatmap_metadata$sample_type)),
                    smoker = setNames(c("#EF8A62", "#67A9CF", "grey"), levels(heatmap_metadata$smoker)),
                    sRNA = setNames(c(brewer.pal(9, "Paired"), "grey"), levels(annot_row$sRNA)))

Expression heatmap

df = luad_adjusted_counts
df = df[rownames(df) %in% excludeExtraneousGenes(df), ]
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "Reds")) 
plot of chunk expression-heatmap

Fold change heatmaps

Fold change is relative to matched tumor-normal pairs or median normal expression.

getLog2FC = function(counts, metadata) {
  counts = counts - min(counts) + 1
  paired_tp_samples = which(metadata$sample_type == "TP" & metadata$paired)
  log2fc = data.frame(numeric(nrow(counts)))
  median_nt_expression = apply(counts[, paired_tp_samples+1], 1, median)
  for (i in 1:ncol(counts)) {
    sample_metadata = metadata[i, ]
    if (sample_metadata$sample_type == "NT") next
    tp_counts = counts[, i]
    if (sample_metadata$paired) nt_counts = counts[, i+1]
    else nt_counts = median_nt_expression
    log2fc = cbind(log2fc, log2(tp_counts) - log2(nt_counts))
  }
  colnames(log2fc) = metadata[metadata$sample_type == "TP", ]$barcode
  log2fc[, -1]
}
log2fc = getLog2FC(luad_adjusted_counts, luad_metadata)

General sRNA heatmap

metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(log2fc), ]
pheatmap(log2fc, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk log2fc-heatmap-1

# filter out genes by correlation
genelist = excludeExtraneousGenes(log2fc)
table(srnas[match(genelist, srnas$tx_name), ]$class)
## 
## genomic-tRF-3 genomic-tRF-5        snoRNA         tRF-1         tRF-3 
##           116           120            21            23            52 
##         tRF-5         tRF-i          tRNA 
##             4            23           194
df = log2fc[rownames(log2fc) %in% genelist, ]

# filter out genes by fold change variance
df = filterVariance(df, 0.5)
# filter metadata
metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(df), ]
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk log2fc-heatmap-2

MicroRNA heatmap

df = log2fc[srnas[match(rownames(log2fc), srnas$tx_name), ]$class == "miRNA", ]

# filter for gene variance and extraneous genes
df = df[rownames(df) %in% excludeExtraneousGenes(df), ]
df = filterVariance(df, 0.9)
metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(df), ]

# heatmap annotation
annot_colors$sRNA = NA
pheatmap(df, annotation_col = annot_col, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk mirna-heatmap

tRF heatmap

df = log2fc[srnas[match(rownames(log2fc), srnas$tx_name), ]$class %in% c("tRF-1", "tRF-3", "tRF-5", "tRF-i"), ]

# filter for gene variance and extraneous genes
genelist = excludeExtraneousGenes(df, 0.98)
df = df[rownames(df) %in% genelist, ]
df = filterVariance(df, 0.5)
metadata = heatmap_metadata[heatmap_metadata$barcode %in% colnames(df), ]

# heatmap annotation
annot_row = data.frame(tRF = as.factor(srnas[match(rownames(df), srnas$tx_name), ]$class))
rownames(annot_row) = rownames(df)
annot_colors$sRNA = NA
annot_colors$tRF = setNames(c(brewer.pal(4, "Dark2"), "grey"), levels(annot_row$tRF))
pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = TRUE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk trf-heatmap

Portable version

pheatmap(df, annotation_col = annot_col, annotation_row = annot_row, annotation_colors = annot_colors, fontsize = 9, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu")) 
plot of chunk trf-heatmap-2

Sort samples by subtype

df2 = df[, metadata[order(metadata$subtype), ]$barcode]
df2 = df2[, metadata[order(metadata$subtype), ]$subtype %in% c("PP", "PI", "TRU")]
annot_col2 = annot_col[colnames(df2), ]
pheatmap(df2, annotation_col = annot_col2, annotation_row = annot_row, annotation_colors = annot_colors, cluster_cols = FALSE, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu"))  
plot of chunk trf-heatmap-3

Sort samples by stage

df2 = df[, metadata[order(metadata$stage), ]$barcode]
df2 = df2[, metadata[order(metadata$stage), ]$stage %in% c("I", "II", "III", "IV")]
annot_col2 = annot_col[colnames(df2), ]
pheatmap(df2, annotation_col = annot_col2, annotation_row = annot_row, annotation_colors = annot_colors, cluster_cols = FALSE, fontsize = 8, scale = "row", show_colnames = FALSE, show_rownames = FALSE, trace = "none", color = brewer.pal(10, "RdBu"))  
plot of chunk trf-heatmap-4

## Saving search path..
## Saving list of loaded packages..
## Saving all data...
## Done.